/**
 * @FileName: ChineseUtils.java
 * @Package: util
 * @author youshipeng
 * @created 2016/11/6 12:50
 * <p>
 * Copyright 2016 ziroom
 */
package util;

import util.enums.LexemeType;
import util.model.ChineseSentence;
import util.model.Lexeme;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static util.CharacterUtil.isChinese;

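/**
 * <p>Static helpers that split an input string into lexemes: single Chinese
 * characters, whole pinyin syllables found in the pinyin dictionary, and
 * single-character acronym pieces for everything else.</p>
 *
 * <PRE>
 * <BR>	Change log
 * <BR>-----------------------------------------------
 * <BR>	Date			Modified by			Description
 * </PRE>
 *
 * @author youshipeng
 * @since 1.0
 * @version 1.0
 */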
public final class ChineseUtils {

    private ChineseUtils() {}

    /**
     * Suspected-pinyin token pattern, compiled once and shared.
     *
     * <p>Each match is either one character in the range U+4E00..U+9FA5, or an
     * optional initial (sh, ch, zh, or any single character outside
     * {@code aoeiuv}), an optional medial ({@code i}, {@code u} or {@code v})
     * and an optional final. Because the initial accepts any character outside
     * {@code aoeiuv}, and each of those six letters is accepted by the medial
     * or the final, a match attempted on non-empty text always consumes at
     * least one character starting at the attempt position.</p>
     */
    private static final Pattern SUSPECTED_PINYIN_PATTERN = Pattern.compile(
            "[\\u4e00-\\u9fa5]|(sh|ch|zh|[^aoeiuv])?[iuv]?(ai|ei|ao|ou|er|ang?|eng?|ong|a|o|e|i|u|ng|n)?");

    /**
     * Wraps the raw input in a {@link ChineseSentence} and fills its sentence
     * units with the lexemes found in the regularized form of the input.
     *
     * <p>The input is first passed through {@code CharacterUtil.regularize};
     * the result is scanned left to right with the suspected-pinyin pattern and
     * every matched token is converted to lexemes by
     * {@link #buildLexeme(String)}. The sentence content is set to the original,
     * un-regularized {@code input}.</p>
     *
     * @param input the raw text to analyse
     * @return a new sentence holding {@code input} as content and the extracted
     *         lexemes as units; the unit list is empty when the regularized
     *         text is {@code null} or empty
     */
    public static ChineseSentence encapsulation(String input) {
        final String standard = CharacterUtil.regularize(input);

        // NOTE(review): declared as ArrayList because the parameter type of
        // setSentenceUnits is not visible here; narrow to List if it allows.
        ArrayList<Lexeme> units = new ArrayList<Lexeme>();

        if (standard != null) {
            Matcher matcher = SUSPECTED_PINYIN_PATTERN.matcher(standard);
            int position = 0;
            // Advance an offset instead of re-slicing the string for each token.
            while (position < standard.length() && matcher.find(position)) {
                if (matcher.end() <= position) {
                    // Defensive: an empty match would never advance the scan.
                    break;
                }
                units.addAll(buildLexeme(matcher.group()));
                position = matcher.end();
            }
        }

        ChineseSentence sentence = new ChineseSentence();
        sentence.setContent(input);
        sentence.setSentenceUnits(units);
        return sentence;
    }

    /**
     * Converts one matched token into its lexemes.
     *
     * <ul>
     *   <li>Content that {@code Utils.isEmpty} reports as empty yields no lexemes.</li>
     *   <li>If {@code isChinese} accepts the first character, the whole token
     *       becomes one {@code CHINESE} lexeme.</li>
     *   <li>If the pinyin dictionary contains the token, it becomes one
     *       {@code WHOLE} lexeme.</li>
     *   <li>Otherwise every character becomes its own {@code ACRONYM} lexeme.</li>
     * </ul>
     *
     * @param content the token text matched by the suspected-pinyin pattern
     * @return a new mutable list of lexemes, possibly empty, never {@code null}
     */
    private static List<Lexeme> buildLexeme(final String content) {
        List<Lexeme> results = new ArrayList<Lexeme>();
        if (Utils.isEmpty(content)) {
            return results;
        }

        if (isChinese(content.charAt(0))) {
            results.add(new Lexeme(content, LexemeType.CHINESE));
        } else if (PinyinDictionary.getInstance().contains(content)) {
            results.add(new Lexeme(content, LexemeType.WHOLE));
        } else {
            // Not a known syllable: treat each character as a separate initial.
            for (int i = 0; i < content.length(); i++) {
                results.add(new Lexeme(String.valueOf(content.charAt(i)), LexemeType.ACRONYM));
            }
        }
        return results;
    }
}